
#########################################################################
############                GET CHILD DATASET               #############
#########################################################################

library(haven)
library(readr)
library(data.table)
library(scales)

filepathimdb="Derived Data/"
panelraw <- fread(paste(filepathimdb,"child_year_panel2025.csv",sep=""), 
               select=c("MAIN_PARENT", "PARENT2", "PARENT1", "IMDB_ID", "Year", "LANDING_AGE", "BIRTH_YEAR", 
                        "Child_EmpInc_IND", "Child_EmpInc_HH", "Child_TIRC_IND", "Child_TIRC_HH", 
                        "Parent1_TIRC_HH", "Parent2_TIRC_HH", "Parent1_TIRC_IND","Parent2_TIRC_IND",
                        "MainParent_EmpInc_HH", "MainParent_TIRC_IND", "MainParent_TIRC_HH","Child_IMM_STAT","CHILD_EMIG_DATE",
                        "MainParent_IMM_STAT","MainParent_EMIG_DATE","Parent1_IMM_STAT","Parent1_EMIG_DATE",
                        #"Parent2_IMM_STAT","Parent2_EMIG_DATE",
                        #"Parent3_IMM_STAT","Parent3_EMIG_DATE",
                        #"Parent4_IMM_STAT","Parent4_EMIG_DATE",
                        "MainParent_NAIC","Parent1_NAIC","Parent2_NAIC")
              )

panel=panelraw
# get raw data about parental emigration for the main parent
length(unique(panel$MAIN_PARENT))
length(unique(panel$MAIN_PARENT[panel$MainParent_IMM_STAT>1]))

# 1/0 for emigrate : valid emigration date in tax record and not missing
panel$mainparent_emig_true=ifelse(panel$MainParent_EMIG_DATE>0 & (!is.na(panel$MainParent_EMIG_DATE)),1,0)



# Drop children without parents:
panel <- panel[!(MAIN_PARENT=="")]

# Impute negative income to missing
panel[panel<0] <- NA

# Need to deflate to real income (2020$)
cpi <- read_dta("H:/Zheng_10223/Joint/cpi/cpi.dta")
panel <- merge(x=panel, y=cpi, by.x="Year", by.y="year", all.x=T)
panel[,c("Child_EmpInc_IND", "Child_TIRC_IND", "Child_TIRC_HH")] <- panel[,c("Child_EmpInc_IND", "Child_TIRC_IND", "Child_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi
panel[,c( "Parent1_TIRC_IND", "Parent1_TIRC_HH")] <- panel[,c( "Parent1_TIRC_IND", "Parent1_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi
panel[,c( "Parent2_TIRC_IND", "Parent2_TIRC_HH")] <- panel[,c( "Parent2_TIRC_IND", "Parent2_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi
panel[,c( "MainParent_TIRC_IND", "MainParent_TIRC_HH")] <- panel[,c("MainParent_TIRC_IND", "MainParent_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi



panel$Child_EmpInc_IND[which(panel$Child_EmpInc_IND>=quantile(panel$Child_EmpInc_IND, 0.999, na.rm=T))] <- quantile(panel$Child_EmpInc_IND, 0.999, na.rm=T)
panel$Child_TIRC_IND[which(panel$Child_TIRC_IND>=quantile(panel$Child_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$Child_TIRC_IND, 0.999, na.rm=T)
panel$Child_TIRC_HH[which(panel$Child_TIRC_HH>=quantile(panel$Child_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$Child_TIRC_HH, 0.999, na.rm=T)
panel$MainParent_TIRC_HH[which(panel$MainParent_TIRC_HH>=quantile(panel$MainParent_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$MainParent_TIRC_HH, 0.999, na.rm=T)
panel$Parent1_TIRC_HH[which(panel$Parent1_TIRC_HH>=quantile(panel$Parent1_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$Parent1_TIRC_HH, 0.999, na.rm=T)
panel$Parent2_TIRC_HH[which(panel$Parent2_TIRC_HH>=quantile(panel$Parent2_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$Parent2_TIRC_HH, 0.999, na.rm=T)
panel$MainParent_TIRC_IND[which(panel$MainParent_TIRC_IND>=quantile(panel$MainParent_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$MainParent_TIRC_IND, 0.999, na.rm=T)
panel$Parent1_TIRC_IND[which(panel$Parent1_TIRC_IND>=quantile(panel$Parent1_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$Parent1_TIRC_IND, 0.999, na.rm=T)
panel$Parent2_TIRC_IND[which(panel$Parent2_TIRC_IND>=quantile(panel$Parent2_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$Parent2_TIRC_IND, 0.999, na.rm=T)

# Get basic landing information for main parent
landing <- read_dta("G:/IMDB_AllYears/rdc/IMDB_BDIM_2022_v1/data_donnees/stata/Core_IMDB/pnrf_1980_2022_f3_v1.dta")
landing <- data.table(landing)

# merge in parent landing info ( note: BIRTH_YEAR in "panel" is child birth year )
panel <- merge(x=panel, y=landing[,c("IMDB_ID", "LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "CITIZEN_YEAR","BIRTH_YEAR","gender")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("BIRTH_YEAR.x","BIRTH_YEAR.y","LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "CITIZEN_YEAR","gender"), new=c("BIRTH_YEAR_child","BirthYear_MainParent","LandingYear_MainParent", "FIRST_TAX_YEAR_MainParent", "LAST_TAX_YEAR_MainParent", "DEATH_YEAR_MainParent", "CITIZEN_YEAR_MainParent","gender_MainParent"))

# get basic landing information for parent1 and parent 2 including gender to get fathers
panel <- merge (x=panel, y=landing[,c("IMDB_ID", "LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "BIRTH_YEAR","gender")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("BIRTH_YEAR","LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR","gender"), new=c("BirthYear_Parent1","LandingYear_PARENT1", "FIRST_TAX_YEAR_PARENT1", "LAST_TAX_YEAR_PARENT1", "DEATH_YEAR_PARENT1","gender_parent1"))

panel <- merge (x=panel, y=landing[,c("IMDB_ID", "LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "BIRTH_YEAR","gender")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("BIRTH_YEAR","LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR","gender"), new=c("BirthYear_Parent2","LandingYear_PARENT2", "FIRST_TAX_YEAR_PARENT2", "LAST_TAX_YEAR_PARENT2", "DEATH_YEAR_PARENT2","gender_parent2"))


### Attempt to follow Chetty et al. 2014 where missing tax returns are imputed to zero
# Problem with imputing missing to zero is that a ton of children and parents never file a tax return
# Some children may have left the country (e.g., go to US for school)
# Solution: 
# i) Drop if child do not file AT LEAST ONE tax return between ages 30-34
# ii) Drop if parents do not file AT LEAST ONE tax returns within 10 years of landing
# This effectively means that any child or parent who is alive and filed at least one tax return between ages 30-34 OR within 10Y of landing, 
# will get missing tax returns imputed as zero. IF instead they do not fill a tax return following the above criteria, then they are dropped
panel[, `:=` (Child_AnyTax30_34=max(!is.na(Child_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))])),
              Parent_AnyTaxPost10=max(!is.na(MainParent_TIRC_HH[which((Year-LandingYear_MainParent) %in% c(0:10))]))), by="IMDB_ID"]
panel <- panel[Child_AnyTax30_34==1 & Parent_AnyTaxPost10==1]
track_drop$Number[track_drop$Sample=="C"]=length(unique(panel$IMDB_ID))

# Replace missing tax returns (provided between landing and last year) with zero
panel[, `:=` (MinDeathLeave_MainParent=pmin(DEATH_YEAR_MainParent, LAST_TAX_YEAR_MainParent, na.rm=T)), by="IMDB_ID"] # get the min between death year and last tax year





setnafill(panel, cols=c("MainParent_TIRC_HH", "Parent1_TIRC_HH", "Parent2_TIRC_HH"), fill=0)
panel[Year<FIRST_TAX_YEAR_MainParent | Year>MinDeathLeave_MainParent, c("MainParent_TIRC_HH", "Parent1_TIRC_HH", "Parent2_TIRC_HH")] <- NA

# Children: 
panel <- merge(x=panel, y=landing[,c("IMDB_ID", "DEATH_YEAR")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("DEATH_YEAR"), new="DEATH_YEAR_Child")



setnafill(panel, cols=c("Child_TIRC_HH", "Child_EmpInc_IND"), fill=0)
panel[Year>DEATH_YEAR_Child, c("Child_EmpInc_IND", "Child_TIRC_IND", "Child_TIRC_HH")] <- NA


# Spouse income
panel$Child_TIRC_spouse=panel$Child_TIRC_HH - panel$Child_TIRC_IND


# clean NAIC:The code "NNN" represents people not associated to a T4 slip and "UUU" means missing NAICS information for the business.
panel$MainParent_NAIC[panel$MainParent_NAIC=="NNN"]=NA
panel$MainParent_NAIC=as.numeric(panel$MainParent_NAIC)

# Aggregate to child-level dataset with relevant child and parent incomes
sample <- panel[, .(
  # Child average total household income at 30-34
  Child_Income_HH_30_34=mean(Child_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  # Child average total individual income at 30-34
  Child_Income_IND_30_34=mean(Child_TIRC_IND[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  # Child average individual earnings at 30-34
  Child_EmpInc_IND_30_34=mean(Child_EmpInc_IND[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  
  # Child spouse income when child is 30-34
  Child_Income_Spouse_30_34=mean(Child_TIRC_spouse[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  
  # Main parent household total income when child is 15-19
  MainParent_Income_HH_15_19=mean(MainParent_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(15,16,17,18,19))], na.rm=T),
  
  # Main parent household total income when child is 10-19
  MainParent_Income_HH_10_19=mean(MainParent_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(10,11,12,13,14,15,16,17,18,19))], na.rm=T),
  
  # Main parent age 45-49 (following Connolly et al., though note that they do mothers only)
  MainParent_Income_HH_MainParentAge45_49=mean(MainParent_TIRC_HH[which((Year-BirthYear_MainParent) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  # Main parent age 45-49 IND (following Connolly et al., though note that they do mothers only)
  MainParent_Income_IND_MainParentAge45_49=mean(MainParent_TIRC_IND[which((Year-BirthYear_MainParent) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  
  # Top earner parent household total income when Parent2 is 12-16
  Parent1_Income_HH_15_19=mean(Parent1_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(15,16,17,18,19))], na.rm=T),
  Parent2_Income_HH_15_19=mean(Parent2_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(15,16,17,18,19))], na.rm=T),
  
  
  # Top earner parent /father household total income when the top earner parent is 45-49
  Parent1_Income_HH_45_49=mean(Parent1_TIRC_HH[which((Year-BirthYear_Parent1) %in% c(45,46,47,48,49))], na.rm=T),
  Parent2_Income_HH_45_49=mean(Parent2_TIRC_HH[which((Year-BirthYear_Parent2) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  # Top earner parent /father IND total income when the top earner parent is 45-49
  Parent1_Income_IND_45_49=mean(Parent1_TIRC_IND[which((Year-BirthYear_Parent1) %in% c(45,46,47,48,49))], na.rm=T),
  Parent2_Income_IND_45_49=mean(Parent2_TIRC_IND[which((Year-BirthYear_Parent2) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  # Main parent household total income 5 and 10 years after landing
  MainParent_Income_HH_PostLanding5=mean(MainParent_TIRC_HH[which((Year-LandingYear_MainParent) %in% c(3,4,5,6,7))], na.rm=T),
  MainParent_Income_HH_PostLanding10=mean(MainParent_TIRC_HH[which((Year-LandingYear_MainParent) %in% c(8,9,10,11,12))], na.rm=T),
  
  # create emigration indicator for if the parent emigrated before age 49 : if this is one then they have an emigration date from Canada before  age 50: checked
  MainParent_emigbefore_49=max(mainparent_emig_true[which((Year-BirthYear_MainParent) <50)],na.rm=T),
  
  # emigration indicator 10 year landing 
  MainParent_emig10yr=max(mainparent_emig_true[which((Year-LandingYear_MainParent) <10)],na.rm=T),
  MainParent_emig20yr=max(mainparent_emig_true[which((Year-LandingYear_MainParent) <20)],na.rm=T),
  MainParent_emig30yr=max(mainparent_emig_true[which((Year-LandingYear_MainParent) <30)],na.rm=T),
  
  # and mainparent industry code at 45
  MainParent_naic45=mean(MainParent_NAIC[which((Year-BirthYear_MainParent) ==45)],na.rm=T)
  
), by="IMDB_ID"]

  
# household Get top parent income and identity : use top parent definition between 45 -49 age
sample$TopParent_Income_HH_45_49 <- pmax(sample$Parent1_Income_HH_45_49, sample$Parent2_Income_HH_45_49)
sample$TopParent <- "PARENT1"
sample$TopParent[which(sample$TopParent_Income_HH_45_49==sample$Parent2_Income_HH_45_49)] <- "PARENT2"

# if no parent 1 or parent 2, then it's just the main parent
sample$TopParent[which(is.na(sample$TopParent_Income_HH_45_49))]="Main Parent"
sample$TopParent_Income_HH_45_49[which(is.na(sample$TopParent_Income_HH_45_49))]=sample$MainParent_Income_HH_MainParentAge45_49[which(is.na(sample$TopParent_Income_HH_45_49))]


# individual  Get top parent income and identity : use top parent definition between 45 -49 age
sample$TopParent_Income_IND_45_49 <- pmax(sample$Parent1_Income_IND_45_49, sample$Parent2_Income_IND_45_49)
sample$TopParent <- "PARENT1"
sample$TopParent[which(sample$TopParent_Income_IND_45_49==sample$Parent2_Income_IND_45_49)] <- "PARENT2"

# if no parent 1 or parent 2, then it's just the main parent
sample$TopParent[which(is.na(sample$TopParent_Income_IND_45_49))]="Main Parent"
sample$TopParent_Income_IND_45_49[which(is.na(sample$TopParent_Income_IND_45_49))]=sample$MainParent_Income_IND_MainParentAge45_49[which(is.na(sample$TopParent_Income_IND_45_49))]



### Get income percentile
child_landing <- read_dta("H:/Zheng_10223/Joint/child_pnrf_1980_2020_f3_v1.dta")
child_landing <- data.table(child_landing)
child_landing$MAIN_PARENT=child_landing$main_parent; child_landing$main_parent=NULL
child_landing$PARENT1=child_landing$Parent1; child_landing$Parent1=NULL
child_landing$PARENT2=child_landing$Parent2; child_landing$Parent2=NULL
child_landing$CHILD_LAST_TAX_YEAR=child_landing$child_last_tax_year
# Income percentiles from the LAD (at year-age-level)
income_pct <- read.csv(paste(filepathimdb,"LAD/finaldist.csv",sep=""), header=TRUE, stringsAsFactors = FALSE)
income_pct <- merge(x=income_pct, y=cpi, by.x="year", by.y="year", all.x=T)
income_pct$pctindTIRC_real <- income_pct$pctindTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pcthhTIRC_real <- income_pct$pcthhTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctindearn_real <- income_pct$pctindearn*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctparentTIRC_real <- income_pct$pctparentTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct <- data.table(income_pct)
income_pct <- income_pct[order(year, age, pct)]

# Get child and main parent landing year and landing age
sample <- merge(x=sample, y=child_landing[,c("IMDB_ID", "BIRTH_YEAR", "LANDING_YEAR", "LANDING_AGE", "PARENT1", "PARENT2", "MAIN_PARENT")], by.x="IMDB_ID", by.y="IMDB_ID")
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "LANDING_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T) #Main parent's landing year
setnames(sample, old=c("LANDING_YEAR.x", "LANDING_YEAR.y", "BIRTH_YEAR"), new=c("LandingYear_Child", "LandingYear_MainParent", "BirthYear_Child"))

# get main parent last tax year
sample=merge(x=sample, y=landing[,c("IMDB_ID","LAST_TAX_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T) 
setnames(sample, old=c("LAST_TAX_YEAR"), new=c("LAST_TAX_YEAR_MAINPARENT"))

# get child last tax year 
sample=merge(x=sample, y=child_landing[,c("IMDB_ID","CHILD_LAST_TAX_YEAR")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T) 



# Get birth year for main parent, parent 1, and parent 2 (latter 2 needed for top parent income)
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "BIRTH_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)  
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "BIRTH_YEAR")], by.x="PARENT1", by.y="IMDB_ID", all.x=T) 
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "BIRTH_YEAR")], by.x="PARENT2", by.y="IMDB_ID", all.x=T) 
setnames(sample, old=c("BIRTH_YEAR.x", "BIRTH_YEAR.y", "BIRTH_YEAR"), new=c("BirthYear_MainParent", "BirthYear_Parent1", "BirthYear_Parent2"))


# Get gender for main parent, parent 1, and parent 2 (latter 2 needed for father income)
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "gender")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)  
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "gender")], by.x="PARENT1", by.y="IMDB_ID", all.x=T) 
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "gender")], by.x="PARENT2", by.y="IMDB_ID", all.x=T) 
setnames(sample, old=c("gender.x", "gender.y", "gender"), new=c("gender_MainParent", "gender_Parent1", "gender_Parent2"))

# Identify Father:
sample$Father=""
sample$Father[sample$gender_MainParent==1]="MAIN_PARENT"
sample$Father[sample$Father=="" & sample$gender_Parent2==1]="Parent2"
sample$Father[sample$Father=="" & sample$gender_Parent1==1]="Parent1"

sample$Father_Income_HH_45_49[sample$Father=="MAIN_PARENT"]=sample$MainParent_Income_HH_MainParentAge45_49[sample$Father=="MAIN_PARENT"]
sample$Father_Income_HH_45_49[sample$Father=="Parent1"]=sample$Parent1_Income_HH_45_49[sample$Father=="Parent1"]
sample$Father_Income_HH_45_49[sample$Father=="Parent2"]=sample$Parent2_Income_HH_45_49[sample$Father=="Parent2"]

# if no father, replace with main parent:
sample$Father_Income_HH_45_49[is.na(sample$Father_Income_HH_45_49)]=sample$MainParent_Income_HH_MainParentAge45_49[is.na(sample$Father_Income_HH_45_49)]


# Get relevant year and age needed for percentile at a year/age
sample$YearAt30 <- sample$BirthYear_Child+30; sample$YearAt10 <- sample$BirthYear_Child+10; sample$YearAt15 <- sample$BirthYear_Child+15; 
sample$MainParentYearat45 <- sample$BirthYear_MainParent+45
sample$MainParentAgeAt15 <- sample$YearAt15-sample$BirthYear_MainParent;
sample$MainParentAgeAt10 <- sample$YearAt10-sample$BirthYear_MainParent;
sample$Parent1AgeAt15 <- sample$YearAt15-sample$BirthYear_Parent1;
sample$Parent2AgeAt15 <- sample$YearAt15-sample$BirthYear_Parent2;


sample$Parent1Yearat45 <- sample$BirthYear_Parent1 + 45;
sample$Parent2Yearat45 <- sample$BirthYear_Parent2 + 45;


sample$TopParentYearAt45 <- NA
sample$TopParentYearAt45[which(sample$TopParent=="Main Parent")] <- sample$MainParentYearat45[which(sample$TopParent=="Main Parent")]

sample$TopParentYearAt45[which(sample$TopParent=="PARENT1")] <- sample$Parent1Yearat45[which(sample$TopParent=="PARENT1")]
sample$TopParentYearAt45[which(sample$TopParent=="PARENT2")] <- sample$Parent2Yearat45[which(sample$TopParent=="PARENT2")]
sample$YearPost10Parent <- sample$LandingYear_MainParent+10; sample$AgePost10Parent <- sample$YearPost10Parent-sample$BirthYear_MainParent
sample$YearPost5Parent <- sample$LandingYear_MainParent+5; sample$AgePost5Parent <- sample$YearPost5Parent-sample$BirthYear_MainParent

sample$FatherYearAt45=NA
sample$FatherYearAt45[which(sample$Father=="MAIN_PARENT")]<-sample$MainParentYearat45[which(sample$Father=="MAIN_PARENT")]
sample$FatherYearAt45[which(sample$Father=="Parent2")]<-sample$Parent2Yearat45[which(sample$Father=="Parent2")]

sample$FatherYearAt45[which(sample$Father=="Parent1")]<-sample$Parent1Yearat45[which(sample$Father=="Parent1")]


# Placeholders for income percentiles
sample$Child_Income_HH_30_34_pct <- NA 
sample$Child_Income_Spouse_30_34_pct <- NA 
sample$Child_Income_IND_30_34_pct <- NA 
sample$Child_EmpInc_IND_30_34_pct <- NA 
sample$MainParent_Income_HH_15_19_pct <- NA
sample$MainParent_Income_HH_10_19_pct<- NA
sample$TopParent_Income_HH_15_19_pct <- NA


sample$TopParent_Income_HH_45_49_pctparent <- NA
sample$TopParent_Income_IND_45_49_pct <- NA

sample$Father_Income_HH_45_49_pctparent <- NA


sample$MainParent_Income_HH_PostLanding5_pcthh <- NA # percentile in the household distribution
sample$MainParent_Income_HH_PostLanding10_pcthh <- NA # percentile in the household distribution
sample$MainParent_Income_HH_PostLanding5_pctparent <- NA # percentile in the parent distribution
sample$MainParent_Income_HH_PostLanding10_pctparent <- NA # percentile in the parent distribution
sample$MainParent_Income_HH_MainParentAge45_49_pctparent<- NA
sample$MainParent_Income_HH_MainParentAge45_49_pcthh<- NA

sample$MainParent_Income_IND_MainParentAge45_49_pct<- NA

# Get child income percentiles (using Canadian 30year old)
for(i in 1992:2019){
  
  # distribution of child household income in the HOUSEHOLD INCOME DISTRIBUTION
  sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_Income_HH_30_34[which(sample$YearAt30==i)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_HH_30_34_pct==min(sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_HH_30_34_pct==min(sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])
  
  
  sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_Income_Spouse_30_34[which(sample$YearAt30==i)], vec=income_pct$pctindTIRC_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_Spouse_30_34_pct==min(sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_Spouse_30_34_pct==min(sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])
  
  
  
  sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_Income_IND_30_34[which(sample$YearAt30==i)], vec=income_pct$pctindTIRC_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_IND_30_34_pct==min(sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_IND_30_34_pct==min(sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])
  
  sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_EmpInc_IND_30_34[which(sample$YearAt30==i)], vec=income_pct$pctindearn_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_EmpInc_IND_30_34_pct==min(sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_EmpInc_IND_30_34_pct==min(sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])

  print(i)
}

# Get parent income percentiles (using Canadian percentiles of adults when child is age 16)
# Need to loop through year and age (e.g., year and age of parent when their child is 16, or 5 year after landing)

# LAD INCOME_PCT ONLY STARTS IN 1982 
for(j in 1982:2019){
  
  # Compare income with rank/percentile of average household income AMONG PARENTS
  

  sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j)] <- findInterval(sample$MainParent_Income_IND_MainParentAge45_49[which(sample$MainParentYearat45==j)], vec=income_pct$pctindTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j  & sample$MainParent_Income_IND_MainParentAge45_49_pct==min(sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j & sample$MainParent_Income_IND_MainParentAge45_49_pct==min(sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j )], na.rm=T))])
  
  
  sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j)] <- findInterval(sample$MainParent_Income_HH_MainParentAge45_49[which(sample$MainParentYearat45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j  & sample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j & sample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j )], na.rm=T))])
  
  # For robustness, compare income with rank/percentile of average household income AMONG PARENTS
  sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j)] <- findInterval(sample$MainParent_Income_HH_MainParentAge45_49[which(sample$MainParentYearat45==j)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j  & sample$MainParent_Income_HH_MainParentAge45_49_pcthh==min(sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j & sample$MainParent_Income_HH_MainParentAge45_49_pcthh==min(sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j )], na.rm=T))])

  
  # fathers
  sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j)] <- findInterval(sample$Father_Income_HH_45_49[which(sample$FatherYearAt45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j  & sample$Father_Income_HH_45_49_pctparent==min(sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j )], na.rm=T))] <-0.5*(1+sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j  & sample$Father_Income_HH_45_49_pctparent==min(sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j )], na.rm=T))])

  
  # top parent
  sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j)] <- findInterval(sample$TopParent_Income_HH_45_49[which(sample$TopParentYearAt45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j  & sample$TopParent_Income_HH_45_49_pctparent==min(sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j )], na.rm=T))] <-0.5*(1+sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j & sample$TopParent_Income_HH_45_49_pctparent==min(sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j )], na.rm=T))])
  
  # top parent IND
  sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j)] <- findInterval(sample$TopParent_Income_IND_45_49[which(sample$TopParentYearAt45==j)], vec=income_pct$pctTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j  & sample$TopParent_Income_IND_45_49_pct==min(sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j )], na.rm=T))] <-0.5*(1+sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j & sample$TopParent_Income_IND_45_49_pct==min(sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j )], na.rm=T))])
  
  for(k in 18:80){
    
    # chetty et al 2014 uses parent income distribution, thus default is to compare against average household income AMONG PARENTS
    sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)] <- findInterval(sample$MainParent_Income_HH_15_19[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k & sample$MainParent_Income_HH_15_19_pct==min(sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k & sample$MainParent_Income_HH_15_19_pct==min(sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)], na.rm=T))])
    
    sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)] <- findInterval(sample$MainParent_Income_HH_10_19[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k & sample$MainParent_Income_HH_10_19_pct==min(sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k & sample$MainParent_Income_HH_10_19_pct==min(sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)], na.rm=T))])
    
    
    sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)] <- findInterval(sample$TopParent_Income_HH_15_19[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k & sample$TopParent_Income_HH_15_19_pct==min(sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)], na.rm=T))] <-0.5*(1+sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k & sample$TopParent_Income_HH_15_19_pct==min(sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)], na.rm=T))])

    # Compare post landing income with rank/percentile of average household income AMONG PARENTS
    sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding5[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pctparent==min(sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pctparent==min(sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))])
    
    sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding10[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pctparent==min(sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pctparent==min(sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))])
  
    # For robustness try comparing against rank/percentile of average household income AMONG ALL HOUSEHOLDS   
    sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding5[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pcthh==min(sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pcthh==min(sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))])
    
    sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding10[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pcthh==min(sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pcthh==min(sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))])
  }
  print(j)
}


fwrite(sample, paste(filepathimdb,"IGM_dataset_2025.csv",sep=""))
# this file feeds into 1_SummaryStatistics.R 

